BREAST CANCER ANALYSIS

Emile Cohen

July 2020

Goal: In this notebook, we want to understand what makes Breast Cancer a textbook case for the patterns we saw, and what are the major subcohorts that drive the signal.


In [2]:
%run -i '../../../../../utils/setup_environment.ipy'

import warnings
warnings.filterwarnings('ignore')
from scipy.stats import fisher_exact, ranksums, chi2, norm
from statsmodels.sandbox.stats.multicomp import multipletests
import matplotlib.gridspec as gridspec
import pickle

data_path = '../../../../../data/'
data_wgd = data_path + 'impact-facets-tp53/processed/wgd/'
data_no_wgd = data_path + 'impact-facets-tp53/processed/no_wgd/'
Setup environment... done!

✅ Working on **mskimpact_env** conda environment.

Interesting functions

In [3]:
from functools import reduce

def get_hotspots(df: pd.DataFrame, Sample_Type: str, group: list = None, group_type:str = None):
    data = df[df['Sample_Type'] == Sample_Type]
    
    if group and group_type:
        data = data[data[group_type].isin(group)]

    data_1 = get_groupby(data,'tp53_spot_1', 'count'); data_2 = get_groupby(data,'tp53_spot_2', 'count'); data_3 = get_groupby(data,'tp53_spot_3', 'count') ; data_4 = get_groupby(data,'tp53_spot_4', 'count') ; data_5 = get_groupby(data,'tp53_spot_5', 'count') 
    series_data = [data_1,data_2,data_3,data_4,data_5]

    df_merged = reduce(lambda  left,right: pd.merge(left,right,left_index=True, right_index=True,
                                                how='outer'), series_data).fillna(0)

    df_merged.columns = ['count_1', 'count_2', 'count_3', 'count_4', 'count_5']
    df_merged['total'] = df_merged.sum(axis=1)
    df_merged = df_merged.sort_values(by='total', ascending=False)

    df_merged = df_merged.drop('nan')
    
    return df_merged

def get_hotspot_frac(df: pd.DataFrame, group_type:str = None, group: list = None, nb = 10):
    if group_type and group:
        df = df[df[group_type].isin(group)]
    result = [['spot', '#', 'frac']]
    for spot in get_groupby(df, 'tp53_spot_1', 'count').sort_values(by='count', ascending=False).head(nb).index.tolist():
        result.append([spot,df[df['tp53_spot_1'] == spot].frac_genome_altered.shape[0], df[df['tp53_spot_1'] == spot].frac_genome_altered.median()])

    return pd.DataFrame(result)


def boxplot_sampletype(df: pd.DataFrame, group:str, palette, order, metrics: str, figsize= (10,3), title: str = '', title_font: int=12, xlim=[0,1]):
    fig=plt.figure(figsize=figsize)
    ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)

    sns.boxplot(y=metrics, x=group,data=df,ax=ax, dodge=False,order=order, palette=palette).set_title(title, weight='bold', fontsize=title_font)
    
    groupby_ = get_groupby(df,group, 'count')
    
    labels = []
    for element in order:
        labels.append(element + '\n('+ str(groupby_.loc[element].values[0])+')')
    
    
    
    ax.set_xticklabels(labels)
    style(ax)
    ax.set_ylim(xlim)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    return fig, ax


# Let's give a look at medians and statistics

def get_statistics(df: pd.DataFrame, group:str, metrics: str, group_list: list):
    group_1 = df[df[group] == group_list[0]][metrics]
    group_2 = df[df[group] == group_list[1]][metrics]

    median_1 = group_1.median()
    median_2 = group_2.median()

    statistic, p_value = ranksums(group_1.dropna().values,group_2.dropna().values)
    
    results = [['', 'size', metrics],
               [group_list[0], group_1.shape[0], median_1],
               [group_list[1], group_2.shape[0], median_2],
               ['', 'Statistics', 'p-value'],
               ['', statistic, p_value]]


    return pd.DataFrame(results)


def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10, tp53=False):
    samples = master.Tumor_Id.tolist()
    if tp53:
        maf_filtered = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['driver'] == True]
    else:
        maf_filtered = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['driver'] == True][maf['Hugo_Symbol'] != 'TP53']
    h = pd.DataFrame(maf_filtered[['Hugo_Symbol']].groupby(['Hugo_Symbol']).size())
    h.columns = ['count']
    h = h.sort_values(by='count', ascending=False).head(head)
    
    return(h)


def create_co_drivers_table(master: pd.DataFrame, group_type:str,  group_1: str, group_2: str):
    master_group_1 = master[master[group_type] == group_1]
    co_drivers_group_1 = get_major_codrivers(master=master_group_1,
                        maf=maf_cohort_nowgd,
                       head=100)
    co_drivers_group_1['proportion_1'] = co_drivers_group_1.apply(lambda x: 100* round(x['count'] / co_drivers_group_1.sum().values[0], 4), axis=1)
    
    master_group_2 = master[master[group_type] == group_2]
    co_drivers_group_2 = get_major_codrivers(master=master_group_2,
                        maf=maf_cohort_nowgd,
                       head=100)
    co_drivers_group_2['proportion_2'] = co_drivers_group_2.apply(lambda x: 100* round(x['count'] / co_drivers_group_2.sum().values[0], 4), axis=1)
    
    co_drivers_groups = pd.merge(co_drivers_group_1, co_drivers_group_2, on='Hugo_Symbol')
    co_drivers_groups['proportion_1'] = - co_drivers_groups['proportion_1']
    
    return co_drivers_groups

Master Definition and Filtering

In [4]:
cancer = 'Breast Cancer'
In [5]:
master_no_wgd = non_wgd_load_and_cut(data_path + 'impact-facets-tp53/processed/no_wgd/master_no_wgd.pkl')
master_wgd = pd.read_pickle(data_path + 'impact-facets-tp53/processed/wgd/master_wgd.pkl')

master_no_wgd_cancer = master_no_wgd[master_no_wgd['Cancer_Type'] == cancer]
master_wgd_cancer = master_wgd[master_wgd['Cancer_Type'] == cancer]

maf_cohort_nowgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/no_wgd/maf_cohort_nowgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
maf_cohort_wgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/wgd/maf_cohort_wgd.txt', sep='\t').drop('Unnamed: 0', axis=1)

What makes Breast Cancer a TextBook Case?

WGD Proportion

Breast Cancer is the biggest cancer in our cohort. Breast Cancer has a slightly low proportion of WGD - around 30%

Cancer Panel

  • Breast Cancer is one of the most represented cancer in MSK-Impact Cohort.
  • Highly enriched in LOSS Samples: 0_HETLOSS and >=1_LOSS account for more than 90% of the cancer cohort
  • Depleted in >=1_cnLOH, 1_WILD_TYPE

Genome Instability

Breast Cancer shows a significant difference in Genome Instability between TP53 Mono-Allelic and Bi-Allelic subgroups - and has a lot of samples in both groups.

In the TP53 subgroup Pan Cancer plot that follows, we can see 2 important signals:

  • 1_WILD_TYPE and 0_HETLOSS GI are very low compared to other subgroups
  • Bi Allelic Subgroups - >=1_LOSS and >=1_cnLOH - have higher GI than other subgroups and the difference is significant

WGD Part

Subgroup Proportion

In the following cells are the proportions of different groupo levels: on the right Primary samples, on the left Metastatic samples.

Key points:

  • Equality of proportion between Pre WGD TP53 Bi-Allelic / Pre WGD TP53 Residual samples
  • Enrichment in 0 tp53 mut / TP53 LOH samples - in sync with 0_HETLOSS enrichment in Non WGD Cohort
  • Clear enrichment for TP53 LOH and TP53 mutation samples

Very High Genome Instability

In WGD cohort, Genome Instability median is above 70% for all cancer types.

We still see difference between TP53 bi-allelic and mono-allelic states but those are not very significant:

No WGD Part - Cancer Investigation

In this section, our goal is to find subcohorts that lead the signals observed. Here are the different subcohort we will create:

  • Hotspot Analysis: splitting on 273 / 248 / 175 / Missense / Truncated / In Frame
  • CCF Analysis
  • SNV/INDEL Analysis

Hotspot Analysis

In this section, we cut our cohort to only keep samples with exactly one TP53 mutation, for simplicity.

In [6]:
master_hotspot = master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] == 1]
In [7]:
get_hotspot_frac(df=master_hotspot,
                group_type=None,
                group=None)
Out[7]:
0 1 2
0 spot # frac
1 nan 43 0.339
2 248 31 0.372
3 273 17 0.259
4 175 16 0.4385
5 342 14 0.3735
6 245 13 0.29
7 176 11 0.332
8 163 10 0.418
9 220 10 0.365
10 285 9 0.272

Entire Cohort

In [8]:
h = get_groupby(master_hotspot,'tp53_vc_group_1', 'count').sort_values(by='count', ascending=False)
display(h)

h = h.T
h = h[mutation_list]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=11)
ax.set_title('Mutation Type - {} - No WGD'.format(cancer), weight='bold', fontsize=18)

plt.show()
count
tp53_vc_group_1
truncated 243
missense 215
248 31
hotspot 31
in_frame 25
175 16
273 15
In [9]:
fig, ax = boxplot_sampletype(df=master_hotspot,
                  group='tp53_vc_group_1',
                  palette=mutation_palette,
                  order=mutation_list,
                  metrics='frac_genome_altered',
                  figsize=(6,10),
                  title='Fraction of Genome Altered - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

TP53 Residual Subgroups

In [10]:
print('Number of Bi Allelic samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'no_tp53_res'].shape[0]))
print('')
print('Number of  TP53 Residual samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'tp53_res'].shape[0]))
Number of Bi Allelic samples (with 1 mut): 525

Number of  TP53 Residual samples (with 1 mut): 23
In [13]:
total_df = []
for group in ['no_tp53_res']:
    h = get_groupby(master_hotspot[master_hotspot['tp53_res_group'] == group], 'tp53_vc_group_1', group).sort_values(by=group, ascending=False)
    total_df.append(h)
    
    h = h.T
    h = h[mutation_list]
    fig = plt.figure(figsize=(6,1))
    ax = plt.subplot()

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
    if group == 'tp53_res':
        ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.05, 0.5), fontsize=11)
    else: ax.get_legend().remove()
    ax.set_title('Mutation Type - {} - No WGD'.format(group), weight='bold', fontsize=18)

    plt.show()

display_side_by_side(total_df[0])
no_tp53_res
tp53_vc_group_1
truncated 224
missense 192
248 29
hotspot 29
in_frame 22
273 15
175 14
In [16]:
for group in ['no_tp53_res']:
    master_wt = master_hotspot[master_hotspot['tp53_res_group'] == group]

    fig, ax = boxplot_sampletype(df=master_wt,
                      group='tp53_vc_group_1',
                      palette=mutation_palette,
                      order=mutation_list,
                      metrics='frac_genome_altered',
                      figsize=(6,10),
                      title='Fraction of Genome Altered - No WGD - {} subgroup'.format(group),
                      xlim=[0,1])
    plt.show()

SNV / INDEL Analysis

In this section we compare SNV and INDEL mutations. As in the previous section, we cut the cohort to keep only samples with exactly 1 tp53 mutation.

# of Drivers / SNV Drivers / INDEL Drivers

In [17]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='driver_mutation_count',
                  figsize=(8,12),
                  title='Driver Mutation Count - TP53 Subroups - No WGD',
                  xlim=[-0.1,50])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size driver_mutation_count
1 1_WILD_TYPE 23 1
2 0_HETLOSS 784 2
3 Statistics p-value
4 -2.96491 0.00302767
0 1 2
0 size driver_mutation_count
1 1_WILD_TYPE 23 1
2 >=1_LOSS 496 1
3 Statistics p-value
4 -1.3469 0.178014
0 1 2
0 size driver_mutation_count
1 >1muts 5 0
2 >=1_LOSS 496 1
3 Statistics p-value
4 -1.34121 0.179852
In [18]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='snv_driver_mutation_count',
                  figsize=(8,12),
                  title='SNV Driver Mutation Count - TP53 Subroups - No WGD',
                  xlim=[-0.1,15])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='snv_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='snv_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='snv_driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size snv_driver_mutation_count
1 1_WILD_TYPE 23 1
2 0_HETLOSS 784 1
3 Statistics p-value
4 -1.72931 0.0837543
0 1 2
0 size snv_driver_mutation_count
1 1_WILD_TYPE 23 1
2 >=1_LOSS 496 1
3 Statistics p-value
4 -0.865456 0.386789
0 1 2
0 size snv_driver_mutation_count
1 >1muts 5 0
2 >=1_LOSS 496 1
3 Statistics p-value
4 -1.23099 0.218325
In [19]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='indel_driver_mutation_count',
                  figsize=(8,12),
                  title='INDEL Driver Mutation Count - TP53 Subroups - No WGD',
                  xlim=[-0.1,35])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 23 0
2 0_HETLOSS 784 1
3 Statistics p-value
4 -2.36549 0.0180063
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 23 0
2 >=1_LOSS 496 0
3 Statistics p-value
4 -0.995594 0.319447
0 1 2
0 size indel_driver_mutation_count
1 >1muts 5 0
2 >=1_LOSS 496 0
3 Statistics p-value
4 -1.08818 0.276515

Genome Instability

The idea here is to see if we have differences in Fraction of Genome Altered if we cut our Cancer cohort on the number of drivers per sample.

Do we have more instability with more INDEL Driver Mutations within the same subgroup?

1_WILD_TYPE Subgroup

In [16]:
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']

thr=6

def get_driver_groups(x):
    if x.driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    

master_no_wgd_cancer_wt['co_driver_group'] = master_no_wgd_cancer_wt.apply(get_driver_groups, axis=1)
In [17]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_wt,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='frac_genome_altered',
                  figsize=(4,10),
                  title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_no_wgd_cancer_wt,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
Out[17]:
0 1 2
0 size frac_genome_altered
1 High Co-Driver Count 43 0.094
2 Low Co-Driver Count 49 0.256
3 Statistics p-value
4 -4.55058 5.34987e-06
In [18]:
def plot_density(df: pd.DataFrame, xlabel='', ylabel='', title='',clip = (0,3), group = None,  figsize=(5,5)):
    sns.set_style("whitegrid", {'grid.color': '1.'})
    fig, ax = plt.subplots(figsize=figsize)
    
    
    data =df['tp53_ccf_1']
    ax = sns.distplot(data,kde_kws={'clip': clip, "shade": True}, hist=False)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    ax.set_title('TP53 CCF for 1_WT in Non WGD Samples'  + title + ' (' + str(len(data)) + ')', weight = 'bold')
    
    mean=round(data.mean(),2) ; median=round(data.median(),2)
    string = 'Mean: '+ str(mean) +'\nMedian: ' + str(median)
    ax.axvline(mean, color='g', linestyle='-', label='Mean: '+ str(mean))
    ax.axvline(median, color='b', linestyle='-', label='Median: ' + str(median))
    ax.legend()
    #ax.set_xlim([0,1])
    
    plt.show()
In [19]:
master_high_count = master_no_wgd_cancer_wt[master_no_wgd_cancer_wt['co_driver_group'] == 'High Co-Driver Count']
master_low_count = master_no_wgd_cancer_wt[master_no_wgd_cancer_wt['co_driver_group'] == 'Low Co-Driver Count']

plot_density(df=master_high_count,
             xlabel='TP53 CCF', 
             ylabel='density estimation',
             title=' - High Co-Driver Count',
             clip = (0,1), 
             group = None,  figsize=(7,3))
plt.show()

plot_density(df=master_low_count,
             xlabel='TP53 CCF', 
             ylabel='density estimation',
             title=' - Low Co-Driver Count',
             clip = (0,1), 
             group = None,  figsize=(7,3))

plt.show()

So we see that samples with less co-drivers have a higher Genome Instability

0_HETLOSS

In [20]:
master_no_wgd_cancer_het = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '0_HETLOSS']

thr=0

def get_driver_groups(x):
    if x.indel_driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.indel_driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    

master_no_wgd_cancer_het['co_driver_group'] = master_no_wgd_cancer_het.apply(get_driver_groups, axis=1)
In [21]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_het,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='frac_genome_altered',
                  figsize=(4,10),
                  title='Fraction of Genome Altered - 0_HETLOSS subgroup - INDEL Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_no_wgd_cancer_het,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
Out[21]:
0 1 2
0 size frac_genome_altered
1 High Co-Driver Count 98 0.283
2 Low Co-Driver Count 38 0.3095
3 Statistics p-value
4 -0.518931 0.603809

Low CCF Analysis

In [23]:
thr_ccf_1 = 0.9

def ccf_subgroup(x):
    if x.tp53_ccf_1 <= thr_ccf_1: return 'low'
    elif x.tp53_ccf_1 > thr_ccf_1: return 'high'

master_no_wgd_cancer['ccf_group'] = master_no_wgd_cancer.apply(ccf_subgroup, axis=1)
get_groupby(master_no_wgd_cancer, 'ccf_group', 'count')
Out[23]:
count
ccf_group
high 663
low 232
In [24]:
pd.DataFrame(master_no_wgd_cancer[['ccf_group', 'tp53_group']].groupby([ 'tp53_group', 'ccf_group']).size())
Out[24]:
0
tp53_group ccf_group
1_WILD_TYPE high 53
low 37
>1muts high 34
low 29
>=1_LOSS high 492
low 130
>=1_cnLOH high 81
low 28
HOMDEL high 1
low 6
In [25]:
master_no_wgd_cancer_low = master_no_wgd_cancer[(master_no_wgd_cancer['ccf_group'] == 'low') | (master_no_wgd_cancer['tp53_count'] == 0)]
get_groupby(master_no_wgd_cancer_low, 'tp53_group', 'count')
Out[25]:
count
tp53_group
0_HETLOSS 136
1_WILD_TYPE 37
>1muts 29
>=1_LOSS 130
>=1_cnLOH 28
HOMDEL 12

# of Drivers

In [26]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_low,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='indel_driver_mutation_count',
                  figsize=(8,12),
                  title='INDEL Driver Mutation Count - TP53 Subroups - No WGD - Low TP53 CCF',
                  xlim=[-0.1,30])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer_low,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer_low,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer_low,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 37 2
2 0_HETLOSS 136 1
3 Statistics p-value
4 3.18934 0.00142598
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 37 2
2 >=1_LOSS 130 1
3 Statistics p-value
4 3.92101 8.81789e-05
0 1 2
0 size indel_driver_mutation_count
1 >1muts 29 2
2 >=1_LOSS 130 1
3 Statistics p-value
4 3.7377 0.000185714

GI in 1_WILD_TYPE

In [27]:
master_no_wgd_cancer_low_wt = master_no_wgd_cancer_low[master_no_wgd_cancer_low['tp53_group'] == '1_WILD_TYPE']

thr=6

def get_driver_groups(x):
    if x.driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    

master_no_wgd_cancer_low_wt['co_driver_group'] = master_no_wgd_cancer_low_wt.apply(get_driver_groups, axis=1)

fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_low_wt,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='frac_genome_altered',
                  figsize=(4,10),
                  title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - Low TP53 CCF - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_no_wgd_cancer_low_wt,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
Out[27]:
0 1 2
0 size frac_genome_altered
1 High Co-Driver Count 16 0.076
2 Low Co-Driver Count 21 0.128
3 Statistics p-value
4 -1.83942 0.0658537

High CCF Analysis

In [28]:
master_no_wgd_cancer_high = master_no_wgd_cancer[(master_no_wgd_cancer['ccf_group'] == 'high') | (master_no_wgd_cancer['tp53_count'] == 0)]
get_groupby(master_no_wgd_cancer_high, 'tp53_group', 'count')
Out[28]:
count
tp53_group
0_HETLOSS 136
1_WILD_TYPE 53
>1muts 34
>=1_LOSS 492
>=1_cnLOH 81
HOMDEL 7

# of Drivers

In [29]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_high,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='driver_mutation_count',
                  figsize=(8,12),
                  title='Driver Mutation Count - TP53 Subroups - No WGD - High TP53 CCF',
                  xlim=[-0.1,40])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer_high,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer_high,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer_high,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 53 4
2 0_HETLOSS 136 1
3 Statistics p-value
4 4.63256 3.61177e-06
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 53 4
2 >=1_LOSS 492 1
3 Statistics p-value
4 5.93346 2.96617e-09
0 1 2
0 size indel_driver_mutation_count
1 >1muts 34 2
2 >=1_LOSS 492 1
3 Statistics p-value
4 4.59217 4.38651e-06

GI in 1_WILD_TYPE

In [30]:
master_no_wgd_cancer_high_wt = master_no_wgd_cancer_high[master_no_wgd_cancer_high['tp53_group'] == '1_WILD_TYPE']

thr=6

def get_driver_groups(x):
    if x.driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    

master_no_wgd_cancer_high_wt['co_driver_group'] = master_no_wgd_cancer_high_wt.apply(get_driver_groups, axis=1)

fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_high_wt,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='frac_genome_altered',
                  figsize=(4,10),
                  title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - High TP53 CCF - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_no_wgd_cancer_high_wt,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
Out[30]:
0 1 2
0 size frac_genome_altered
1 High Co-Driver Count 26 0.098
2 Low Co-Driver Count 27 0.318
3 Statistics p-value
4 -4.86612 1.13812e-06
In [33]:
get_hotspot_frac(df=master_no_wgd_cancer_high_wt[master_no_wgd_cancer_high_wt['co_driver_group'] == 'High Co-Driver Count'],
                group_type=None,
                group=None)
Out[33]:
0 1 2
0 spot # frac
1 90 3 0.089
2 382 3 0.153
3 273 3 0.094
4 342 2 0.0855
5 248 1 0.084
6 73 1 0
7 394 1 0.281
8 282 1 0.106
9 249 1 0.036
10 113 1 0.055
In [34]:
get_hotspot_frac(df=master_no_wgd_cancer_high_wt[master_no_wgd_cancer_high_wt['co_driver_group'] == 'Low Co-Driver Count'],
                group_type=None,
                group=None)
Out[34]:
0 1 2
0 spot # frac
1 175 5 0.303
2 273 4 0.302
3 135 2 0.341
4 151 2 0.4435
5 248 2 0.467
6 282 2 0.3425
7 nan 2 0.395
8 132 1 0.337
9 138 1 0.174
10 181 1 0.058
In [53]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_high_wt,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='Patient_Current_Age',
                  figsize=(4,10),
                  title='Patient Age - 1_WILD_TYPE subgroup - High TP53 CCF - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[20,100])
plt.show()
In [69]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis', fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(['High Co-Driver Count', 'Low Co-Driver Count'], range(len(['High Co-Driver Count', 'Low Co-Driver Count']))):
    data = master_no_wgd_cancer_high_wt[master_no_wgd_cancer_high_wt['co_driver_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
    kmf.plot_survival_function(color = ['#FF9900' ,'#146EB4'][i], ax=ax)
plt.show()
In [ ]:
 

Co Driver Analysis

In [20]:
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
                    maf=maf_cohort_nowgd,
                    head=15)

codrivers_cancer_tp53 = get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] >= 1],
                    maf=maf_cohort_nowgd,
                    head=15)
In [21]:
co_drivers = pd.merge(codrivers_cancer, codrivers_cancer_tp53, on='Hugo_Symbol')
co_drivers.columns = ['cancer', 'cancer_tp53']
In [22]:
co_drivers['ratio'] = co_drivers.apply(lambda x: 100*round(x.cancer_tp53/x.cancer, 4) , axis=1)
co_drivers = co_drivers.sort_values(by='ratio', ascending=False)
In [23]:
co_drivers
Out[23]:
cancer cancer_tp53 ratio
Hugo_Symbol
NF1 85 41 48.24
MAP2K4 91 36 39.56
AKT1 153 42 27.45
PTEN 154 41 26.62
KMT2C 170 33 19.41
PIK3CA 1079 205 19.00
ARID1A 173 29 16.76
TBX3 108 18 16.67
CDH1 442 63 14.25
GATA3 378 51 13.49
ESR1 183 22 12.02
MAP3K1 245 22 8.98
In [24]:
labels = []
for element in co_drivers.index.tolist():
    labels.append(element + ' ('+ str(int(co_drivers.loc[element]['cancer']))+')')

ax = sns.barplot(y=co_drivers.index, x='ratio',data=co_drivers[['ratio']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Co-Drivers Enrichment in TP53 State')
Out[24]:
Text(0.5, 1.0, 'Co-Drivers Enrichment in TP53 State')
In [25]:
codrivers_cancer
Out[25]:
count
Hugo_Symbol
PIK3CA 1079
CDH1 442
GATA3 378
MAP3K1 245
ESR1 183
ARID1A 173
KMT2C 170
PTEN 154
AKT1 153
CBFB 117
TBX3 108
FOXA1 104
MAP2K4 91
ERBB2 89
NF1 85
In [26]:
labels = []
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
                                       maf=maf_cohort_nowgd,
                                       head=15,
                                      tp53=True)

codrivers_cancer['proportion'] = codrivers_cancer.apply(lambda x: 100* round(x['count'] / codrivers_cancer.sum().values[0], 4), axis=1)

for element in codrivers_cancer.head(10).index.tolist():
    labels.append(element + ' ('+ str(int(codrivers_cancer.loc[element]['count']))+')')

ax = sns.barplot(y=codrivers_cancer.head(10).index, x='proportion',data=codrivers_cancer.head(10)[['proportion']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Drivers Frequency in {}'.format(cancer))
Out[26]:
Text(0.5, 1.0, 'Drivers Frequency in Breast Cancer')

Co-Drivers per subgroup

In [27]:
co_drivers_res = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_res_group',
                                                group_1='tp53_res',
                                                group_2='no_tp53_res')
co_drivers_res
Out[27]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
PIK3CA 361 -19.64 188 18.54
CDH1 193 -10.50 62 6.11
GATA3 155 -8.43 51 5.03
ESR1 68 -3.70 24 2.37
ARID1A 67 -3.65 25 2.47
PTEN 55 -2.99 39 3.85
TBX3 55 -2.99 20 1.97
MAP3K1 54 -2.94 16 1.58
AKT1 48 -2.61 36 3.55
MAP2K4 47 -2.56 33 3.25
KMT2C 47 -2.56 32 3.16
CBFB 41 -2.23 9 0.89
FOXA1 39 -2.12 14 1.38
NCOR1 36 -1.96 20 1.97
ERBB2 35 -1.90 14 1.38
RUNX1 32 -1.74 13 1.28
SPEN 28 -1.52 11 1.08
NF1 26 -1.41 42 4.14
SF3B1 26 -1.41 7 0.69
SMAD4 21 -1.14 15 1.48
BRCA2 18 -0.98 11 1.08
RB1 17 -0.92 31 3.06
GPS2 16 -0.87 8 0.79
CDKN1B 12 -0.65 3 0.30
RHOA 12 -0.65 6 0.59
ERBB3 11 -0.60 3 0.30
MGA 10 -0.54 2 0.20
CDKN2A 10 -0.54 8 0.79
CREBBP 10 -0.54 2 0.20
SETD2 10 -0.54 4 0.39
KRAS 10 -0.54 7 0.69
KMT2A 9 -0.49 5 0.49
JAK1 9 -0.49 6 0.59
PIK3R1 8 -0.44 20 1.97
CTCF 7 -0.38 2 0.20
INPPL1 7 -0.38 6 0.59
FOXP1 7 -0.38 3 0.30
ARID2 7 -0.38 11 1.08
ZFHX3 6 -0.33 5 0.49
POLE 5 -0.27 4 0.39
PIK3R3 5 -0.27 3 0.30
SPOP 5 -0.27 2 0.20
FAT1 5 -0.27 6 0.59
KMT2D 5 -0.27 12 1.18
FGFR2 5 -0.27 2 0.20
ATM 5 -0.27 3 0.30
PTPRT 5 -0.27 2 0.20
ASXL2 5 -0.27 2 0.20
ARID1B 5 -0.27 2 0.20
KDM6A 4 -0.22 8 0.79
SOX9 4 -0.22 4 0.39
PTPRD 4 -0.22 3 0.30
NOTCH2 4 -0.22 4 0.39
ATRX 4 -0.22 4 0.39
MTOR 3 -0.16 3 0.30
STK11 3 -0.16 4 0.39
BRAF 3 -0.16 2 0.20
CASP8 3 -0.16 3 0.30
NSD1 3 -0.16 4 0.39
NUP93 3 -0.16 6 0.59
KMT2B 3 -0.16 3 0.30
STAG2 3 -0.16 2 0.20
KDM5C 3 -0.16 2 0.20
HIST1H3B 3 -0.16 2 0.20
TSC1 3 -0.16 2 0.20
FLT3 2 -0.11 2 0.20
BCOR 2 -0.11 3 0.30
PBRM1 2 -0.11 4 0.39
In [32]:
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_res[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#2ECC71','#1E8449'])
ax.legend(['TP53 Residual', 'No TP53 Residual'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [25, 20, 15, 10, 5, 0, 5, 10, 15, 20, 25]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()
In [33]:
co_drivers_cnloh_loss = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='>=1_cnLOH',
                                                group_2='>=1_LOSS')
co_drivers_cnloh_loss
Out[33]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
PIK3CA 16 -16.0 171 18.71
RB1 6 -6.0 26 2.84
MAP3K1 3 -3.0 15 1.64
KMT2C 3 -3.0 30 3.28
GATA3 3 -3.0 46 5.03
AKT1 3 -3.0 36 3.94
STK11 2 -2.0 2 0.22
PTEN 2 -2.0 37 4.05
KMT2D 2 -2.0 9 0.98
NCOR1 2 -2.0 15 1.64
NOTCH2 2 -2.0 2 0.22
INPPL1 2 -2.0 4 0.44
PIK3R3 2 -2.0 1 0.11
ESR1 2 -2.0 20 2.19
ARID2 2 -2.0 7 0.77
CDH1 2 -2.0 57 6.24
BRCA1 2 -2.0 6 0.66
RUNX1 1 -1.0 14 1.53
SPEN 1 -1.0 12 1.31
PIK3R1 1 -1.0 19 2.08
TERT 1 -1.0 4 0.44
SMAD4 1 -1.0 14 1.53
NF1 1 -1.0 39 4.27
SMARCA4 1 -1.0 4 0.44
MTOR 1 -1.0 2 0.22
ARID1A 1 -1.0 25 2.74
BCOR 1 -1.0 3 0.33
BRCA2 1 -1.0 9 0.98
CBFB 1 -1.0 6 0.66
CDK12 1 -1.0 3 0.33
KRAS 1 -1.0 5 0.55
ERBB2 1 -1.0 14 1.53
FAT1 1 -1.0 4 0.44
FOXP1 1 -1.0 2 0.22
KEAP1 1 -1.0 3 0.33
KMT2A 1 -1.0 4 0.44
In [37]:
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_cnloh_loss[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[4],mc[0]])
ax.legend(['>=1_cnLOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [20, 15, 10, 5, 0, 5, 10, 15, 20, 25]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()

Proportion are the same in both groups

In [54]:
co_drivers_losses = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='0_HETLOSS',
                                                group_2='>=1_LOSS')
co_drivers_losses
Out[54]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
PIK3CA 351 -19.53 171 18.71
CDH1 193 -10.74 57 6.24
GATA3 153 -8.51 46 5.03
ESR1 68 -3.78 20 2.19
ARID1A 67 -3.73 25 2.74
TBX3 55 -3.06 18 1.97
PTEN 53 -2.95 37 4.05
MAP3K1 50 -2.78 15 1.64
KMT2C 47 -2.62 30 3.28
MAP2K4 47 -2.62 36 3.94
AKT1 46 -2.56 36 3.94
CBFB 41 -2.28 6 0.66
FOXA1 38 -2.11 14 1.53
NCOR1 36 -2.00 15 1.64
ERBB2 35 -1.95 14 1.53
RUNX1 29 -1.61 14 1.53
NF1 26 -1.45 39 4.27
SPEN 26 -1.45 12 1.31
SF3B1 25 -1.39 6 0.66
SMAD4 21 -1.17 14 1.53
BRCA2 18 -1.00 9 0.98
RB1 16 -0.89 26 2.84
GPS2 15 -0.83 8 0.88
RHOA 12 -0.67 6 0.66
CDKN1B 12 -0.67 3 0.33
KRAS 10 -0.56 5 0.55
CDKN2A 10 -0.56 8 0.88
SETD2 10 -0.56 4 0.44
ERBB3 10 -0.56 3 0.33
KMT2A 9 -0.50 4 0.44
JAK1 9 -0.50 5 0.55
MGA 8 -0.45 2 0.22
PIK3R1 8 -0.45 19 2.08
CTCF 7 -0.39 2 0.22
INPPL1 7 -0.39 4 0.44
ARID2 7 -0.39 7 0.77
FOXP1 6 -0.33 2 0.22
ZFHX3 6 -0.33 5 0.55
POLE 5 -0.28 4 0.44
FAT1 5 -0.28 4 0.44
ATM 5 -0.28 4 0.44
FGFR2 5 -0.28 2 0.22
KMT2D 5 -0.28 9 0.98
PTPRT 5 -0.28 2 0.22
ATRX 4 -0.22 4 0.44
KDM6A 4 -0.22 8 0.88
SOX9 4 -0.22 4 0.44
SPOP 4 -0.22 2 0.22
PIK3R3 4 -0.22 1 0.11
NOTCH2 4 -0.22 2 0.22
PTPRD 4 -0.22 3 0.33
KMT2B 3 -0.17 3 0.33
NUP93 3 -0.17 6 0.66
NSD1 3 -0.17 4 0.44
KDM5C 3 -0.17 2 0.22
CASP8 3 -0.17 3 0.33
STAG2 3 -0.17 1 0.11
MTOR 2 -0.11 2 0.22
TET2 2 -0.11 6 0.66
PBRM1 2 -0.11 3 0.33
BCOR 2 -0.11 3 0.33
STK11 2 -0.11 2 0.22
MSH2 2 -0.11 2 0.22
In [39]:
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_losses[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[5],mc[0]])
ax.legend(['0_HETLOSS', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
#a = [-40, -30, -20, -10, 0, 10, 20, 30, 40]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()

Same proportions, enrichment in APC and KRAS


In [40]:
co_drivers_mult_cnloh = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='>1muts',
                                                group_2='>=1_cnLOH')
co_drivers_mult_cnloh
Out[40]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
BRCA2 1 -12.5 1 1.0
CREBBP 1 -12.5 1 1.0
PTEN 1 -12.5 2 2.0
In [41]:
get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>1muts'],
                    maf=maf_cohort_nowgd,
                    head=100)
Out[41]:
count
Hugo_Symbol
ABL1 1
BRCA2 1
CREBBP 1
JAK3 1
MAP2K1 1
MYCN 1
PTEN 1
SPOP 1
In [42]:
fig=plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_mult_cnloh[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[3],mc[4]])
ax.legend(['>1muts', '>=1_cnLOH'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [-20, -10, 0, 10, 20, 30, 40]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()
In [43]:
co_drivers_wt_loss = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='1_WILD_TYPE',
                                                group_2='>=1_LOSS')
co_drivers_wt_loss
Out[43]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
PIK3CA 10 -25.0 171 18.71
MAP3K1 4 -10.0 15 1.64
RUNX1 3 -7.5 14 1.53
AKT1 2 -5.0 36 3.94
SPEN 2 -5.0 12 1.31
GATA3 2 -5.0 46 5.03
MGA 2 -5.0 2 0.22
SF3B1 1 -2.5 6 0.66
RB1 1 -2.5 26 2.84
PTEN 1 -2.5 37 4.05
PIK3R3 1 -2.5 1 0.11
MTOR 1 -2.5 2 0.22
PAX5 1 -2.5 2 0.22
GPS2 1 -2.5 8 0.88
FOXP1 1 -2.5 2 0.22
FOXA1 1 -2.5 14 1.53
ERBB3 1 -2.5 3 0.33
STK11 1 -2.5 2 0.22
In [45]:
fig=plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_wt_loss[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[2],mc[0]])
ax.legend(['1_WILD_TYPE', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
#a = [-10, 0, 10, 20, 30, 40]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()

Comparison with WGD Cohort (WGD - TP53 - LOH)

In [46]:
def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10):
    samples = master.Tumor_Id.tolist()
    maf_filtered = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['driver'] == True][maf['Hugo_Symbol'] != 'TP53']
    h = pd.DataFrame(maf_filtered[['Hugo_Symbol']].groupby(['Hugo_Symbol']).size())
    h.columns = ['count']
    h = h.sort_values(by='count', ascending=False).head(head)
    
    return(h)

def create_co_drivers_table_wgd(master_1: pd.DataFrame, master_2: pd.DataFrame, group_type:str,  group_1: str):
    master_group_1 = master_1[master_1[group_type] == group_1]
    co_drivers_group_1 = get_major_codrivers(master=master_group_1,
                        maf=maf_cohort_nowgd,
                       head=100)
    co_drivers_group_1['proportion_1'] = co_drivers_group_1.apply(lambda x: 100* round(x['count'] / co_drivers_group_1.sum().values[0], 4), axis=1)
    
    master_group_2 = master_2[master_2['tp53_count'] >=1][master_2['tp53_loh_status'] == True]
    co_drivers_group_2 = get_major_codrivers(master=master_group_2,
                        maf=maf_cohort_wgd,
                       head=100)
    co_drivers_group_2['proportion_2'] = co_drivers_group_2.apply(lambda x: 100* round(x['count'] / co_drivers_group_2.sum().values[0], 4), axis=1)
    
    co_drivers_groups = pd.merge(co_drivers_group_2, co_drivers_group_1, on='Hugo_Symbol')
    co_drivers_groups['proportion_2'] = - co_drivers_groups['proportion_2']
    
    return co_drivers_groups
In [47]:
co_drivers_wgd_loss = create_co_drivers_table_wgd(master_1=master_no_wgd_cancer,
                            master_2=master_wgd_cancer,
                            group_type='tp53_group',
                            group_1='>=1_LOSS')
co_drivers_wgd_loss
Out[47]:
count_x proportion_2 count_y proportion_1
Hugo_Symbol
PIK3CA 149 -20.11 171 18.71
PTEN 47 -6.34 37 4.05
RB1 33 -4.45 26 2.84
KMT2C 26 -3.51 30 3.28
MAP3K1 26 -3.51 15 1.64
AKT1 24 -3.24 36 3.94
GATA3 24 -3.24 46 5.03
NF1 22 -2.97 39 4.27
PIK3R1 20 -2.70 19 2.08
RUNX1 20 -2.70 14 1.53
CDH1 16 -2.16 57 6.24
NCOR1 14 -1.89 15 1.64
TBX3 13 -1.75 18 1.97
ESR1 12 -1.62 20 2.19
BRCA1 11 -1.48 6 0.66
MAP2K4 10 -1.35 36 3.94
ARID1A 9 -1.21 25 2.74
ERBB2 8 -1.08 14 1.53
CDKN2A 8 -1.08 8 0.88
STK11 7 -0.94 2 0.22
NOTCH3 7 -0.94 5 0.55
KMT2D 7 -0.94 9 0.98
ARID2 7 -0.94 7 0.77
FLT3 6 -0.81 2 0.22
TET2 6 -0.81 6 0.66
INPPL1 5 -0.67 4 0.44
FAT1 5 -0.67 4 0.44
SF3B1 5 -0.67 6 0.66
BAP1 5 -0.67 2 0.22
TERT 5 -0.67 4 0.44
MGA 4 -0.54 2 0.22
SPEN 4 -0.54 12 1.31
CDKN1B 4 -0.54 3 0.33
KRAS 4 -0.54 5 0.55
ATM 4 -0.54 4 0.44
ERBB3 4 -0.54 3 0.33
JAK1 4 -0.54 5 0.55
CBFB 4 -0.54 6 0.66
KDM6A 4 -0.54 8 0.88
KMT2A 4 -0.54 4 0.44
CDK12 3 -0.40 3 0.33
STAG2 3 -0.40 1 0.11
TSC2 3 -0.40 2 0.22
INPP4B 3 -0.40 2 0.22
NUP93 3 -0.40 6 0.66
SMAD4 3 -0.40 14 1.53
BRCA2 3 -0.40 9 0.98
TCF7L2 3 -0.40 1 0.11
SETD2 3 -0.40 4 0.44
FOXA1 3 -0.40 14 1.53
PTPRD 2 -0.27 3 0.33
SOX9 2 -0.27 4 0.44
SMARCA4 2 -0.27 4 0.44
NOTCH2 2 -0.27 2 0.22
PBRM1 2 -0.27 3 0.33
PMS1 2 -0.27 1 0.11
PRDM1 2 -0.27 1 0.11
KMT2B 2 -0.27 3 0.33
ZFHX3 2 -0.27 5 0.55
CIC 2 -0.27 2 0.22
ATRX 2 -0.27 4 0.44
ATR 2 -0.27 2 0.22
FGFR4 2 -0.27 3 0.33
SDHA 1 -0.13 1 0.11
CASP8 1 -0.13 3 0.33
RHOA 1 -0.13 6 0.66
BCOR 1 -0.13 3 0.33
In [49]:
fig=plt.figure(figsize=(8,8))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_wgd_loss[['proportion_2', 'proportion_1']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#7F8C8D',mc[0]])
ax.legend(['WGD - TP53 - LOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
#a = [-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()

Genome Instability Associated

In [51]:
def get_master_codrivers(master: pd.DataFrame, maf: pd.DataFrame, symbol: str):
    samples = master.Tumor_Id.tolist()
    samples_final = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['Hugo_Symbol'] == symbol].Tumor_Sample_Barcode.tolist()

    master_filtered = master[master.Tumor_Id.isin(samples_final)]
    
    return master_filtered

>=1_cnLOH

In [55]:
master_no_wgd_cancer_cnloh = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_cnLOH']
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                   maf=maf_cohort_nowgd,
                                   symbol='PIK3CA')

master_RB1 = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                   maf=maf_cohort_nowgd,
                                   symbol='RB1')

master_no_wgd_cancer_cnloh['data'] = '>=1_cnLOH'
master_RB1['data'] = 'RB1'
master_PIK3CA['data'] = 'PIK3CA'

masters = [master_no_wgd_cancer_cnloh, master_RB1, master_PIK3CA]
allMasters = pd.concat(masters)
In [56]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_cnLOH')
ax.set_xlabel('')
Out[56]:
Text(0.5, 0, '')

>=1_LOSS

In [57]:
master_no_wgd_cancer_loss = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_LOSS']
master_CDH1 = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='CDH1')

master_GATA3 = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='GATA3')

master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='PIK3CA')

master_no_wgd_cancer_loss['data'] = '>=1_loss'
master_CDH1['data'] = 'CDH1'
master_GATA3['data'] = 'GATA3'
master_PIK3CA['data'] = 'PIK3CA'

masters = [master_no_wgd_cancer_loss, master_CDH1, master_GATA3, master_PIK3CA]
allMasters = pd.concat(masters)
In [58]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_LOSS')
ax.set_xlabel('')
Out[58]:
Text(0.5, 0, '')

0_HETLOSS

In [59]:
master_no_wgd_cancer_loss = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '0_HETLOSS']
master_ = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='APC')

master_CDH1 = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='CDH1')

master_GATA3 = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='GATA3')

master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='PIK3CA')

master_no_wgd_cancer_loss['data'] = '>=1_loss'
master_CDH1['data'] = 'CDH1'
master_GATA3['data'] = 'GATA3'
master_PIK3CA['data'] = 'PIK3CA'

masters = [master_no_wgd_cancer_loss, master_CDH1, master_GATA3, master_PIK3CA]
allMasters = pd.concat(masters)
In [61]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - 0_HETLOSS')
ax.set_xlabel('')
Out[61]:
Text(0.5, 0, '')

1_WILD_TYPE

In [62]:
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']

master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_wt,
                                   maf=maf_cohort_nowgd,
                                   symbol='PIK3CA')

master_MAP3K1 = get_master_codrivers(master=master_no_wgd_cancer_wt,
                                   maf=maf_cohort_nowgd,
                                   symbol='MAP3K1')


master_no_wgd_cancer_wt['data'] = '1_WT'
master_PIK3CA['data'] = 'PIK3CA'
master_MAP3K1['data'] = 'MAP3K1'

masters = [master_no_wgd_cancer_wt, master_PIK3CA, master_MAP3K1]
allMasters = pd.concat(masters)
In [63]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - 1_WT')
ax.set_xlabel('')
Out[63]:
Text(0.5, 0, '')

CCF / VAF Analysis

Same here we take only samples with exactly 1 tp53 mutation (master_hotspot).

We have to define groups for CCF to see if there are differences between those groups. To have an idea of the CCF distribution we show here the distribution coming from the cancer_panel.

We see that our tp53_ccf distribution is very high for all subgroups. >=1_LOSS and 0_HETLOSS are the biggest subgroups - by far - and >=1_LOSS* has a very high CCF median.

It will be hard to cut the cohort based on the CCF. Let's try and see the size of the subcohorts:

In [64]:
master_ccf = master_no_wgd_cancer[(master_no_wgd_cancer['tp53_count'] == 1) | (master_no_wgd_cancer['tp53_group'] == '0_HETLOSS')]
In [66]:
thr_ccf_1 = 0.9 ; thr_ccf_2 = 0.95

def ccf_subgroup(x):
    if x.tp53_ccf_1 <= thr_ccf_1: return 'low'
    elif x.tp53_ccf_1 <= thr_ccf_2: return 'medium'
    elif x.tp53_ccf_1 > thr_ccf_2: return 'high'

master_ccf['ccf_group'] = master_ccf.apply(ccf_subgroup, axis=1)
get_groupby(master_ccf, 'ccf_group', 'count')
Out[66]:
count
ccf_group
high 366
low 145
medium 64
In [67]:
thr_vaf_1 = 0.3 ; thr_vaf_2 = 0.4

def vaf_subgroup(x):
    if x.tp53_vaf_1 <= thr_vaf_1: return 'low'
    elif x.tp53_vaf_1 <= thr_vaf_2: return 'medium'
    elif x.tp53_vaf_1 > thr_vaf_2: return 'high'

master_ccf['vaf_group'] = master_ccf.apply(vaf_subgroup, axis=1)       
get_groupby(master_ccf, 'vaf_group', 'count')
Out[67]:
count
vaf_group
high 214
low 273
medium 89

VAF Analysis

No VAF Cut

In [68]:
fig, ax = boxplot_sampletype(df=master_ccf,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_ccf,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['0_HETLOSS', '1_WILD_TYPE'])
Out[68]:
0 1 2
0 size frac_genome_altered
1 0_HETLOSS 784 0.2275
2 1_WILD_TYPE 23 0.176
3 Statistics p-value
4 1.83776 0.0660983

Low VAF

In [69]:
master_low = master_ccf[(master_ccf['vaf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_low,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - VAF < {} - {}'.format(thr_vaf_1,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_low,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '>=1_LOSS'])
Out[69]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 23 0.176
2 >=1_LOSS 219 0.3
3 Statistics p-value
4 -3.35657 0.000789144

Medium VAF

In [70]:
master_med = master_ccf[(master_ccf['vaf_group'] == 'medium') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_med,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {} < VAF < {} - {}'.format(thr_vaf_1,thr_vaf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_med,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-70-b5c51832bd39> in <module>
      8                   figsize=(5,10),
      9                   title='Fraction of Genome Altered - {} < VAF < {} - {}'.format(thr_vaf_1,thr_vaf_2,cancer),
---> 10                   xlim=[0,1])
     11 plt.show()
     12 

<ipython-input-3-27302fc2b541> in boxplot_sampletype(df, group, palette, order, metrics, figsize, title, title_font, xlim)
     41     labels = []
     42     for element in order:
---> 43         labels.append(element + '\n('+ str(groupby_.loc[element].values[0])+')')
     44 
     45 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1765 
   1766             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767             return self._getitem_axis(maybe_callable, axis=axis)
   1768 
   1769     def _is_scalar_access(self, key: Tuple):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1962         # fall thru to straight lookup
   1963         self._validate_key(key, axis)
-> 1964         return self._get_label(key, axis=axis)
   1965 
   1966 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
    622             raise IndexingError("no slices here, handle elsewhere")
    623 
--> 624         return self.obj._xs(label, axis=axis)
    625 
    626     def _get_loc(self, key: int, axis: int):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3535             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3536         else:
-> 3537             loc = self.index.get_loc(key)
   3538 
   3539             if isinstance(loc, np.ndarray):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

High VAF

In [71]:
master_high = master_ccf[(master_ccf['vaf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_high,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - VAF > {} - {}'.format(thr_vaf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_high,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-71-d72ad9dc6c47> in <module>
      8                   figsize=(5,10),
      9                   title='Fraction of Genome Altered - VAF > {} - {}'.format(thr_vaf_2,cancer),
---> 10                   xlim=[0,1])
     11 plt.show()
     12 

<ipython-input-3-27302fc2b541> in boxplot_sampletype(df, group, palette, order, metrics, figsize, title, title_font, xlim)
     41     labels = []
     42     for element in order:
---> 43         labels.append(element + '\n('+ str(groupby_.loc[element].values[0])+')')
     44 
     45 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1765 
   1766             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767             return self._getitem_axis(maybe_callable, axis=axis)
   1768 
   1769     def _is_scalar_access(self, key: Tuple):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1962         # fall thru to straight lookup
   1963         self._validate_key(key, axis)
-> 1964         return self._get_label(key, axis=axis)
   1965 
   1966 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
    622             raise IndexingError("no slices here, handle elsewhere")
    623 
--> 624         return self.obj._xs(label, axis=axis)
    625 
    626     def _get_loc(self, key: int, axis: int):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3535             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3536         else:
-> 3537             loc = self.index.get_loc(key)
   3538 
   3539             if isinstance(loc, np.ndarray):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

CCF Analysis

No CCF Cut

In [72]:
fig, ax = boxplot_sampletype(df=master_ccf,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_ccf,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['0_HETLOSS', '1_WILD_TYPE'])
Out[72]:
0 1 2
0 size frac_genome_altered
1 0_HETLOSS 784 0.2275
2 1_WILD_TYPE 23 0.176
3 Statistics p-value
4 1.83776 0.0660983

Low CCF

In [73]:
master_low = master_ccf[(master_ccf['ccf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_low,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - CCF < {} - {}'.format(thr_ccf_1,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_low,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
Out[73]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 14 0.0935
2 0_HETLOSS 784 0.2275
3 Statistics p-value
4 -2.7384 0.00617393

Medium CCF

In [74]:
master_med = master_ccf[(master_ccf['ccf_group'] == 'medium') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_med,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {} < CCF < {} - {}'.format(thr_ccf_1,thr_ccf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_med,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
Out[74]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 2 0.2175
2 0_HETLOSS 784 0.2275
3 Statistics p-value
4 -0.19334 0.846693

High CCF

In [75]:
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_high,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - CCF > {} - {}'.format(thr_ccf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_high,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
Out[75]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 7 0.292
2 0_HETLOSS 784 0.2275
3 Statistics p-value
4 0.628077 0.529954

Splitting on the different level of CCF / VAF

In [76]:
fig, ax = boxplot_sampletype(df=master_hotspot,
                  group='vaf_group',
                  palette={'low': tab10[0] , 'medium': tab10[1], 'high':tab10[2]},
                  order=['low', 'medium', 'high'],
                  metrics='frac_genome_altered',
                  figsize=(3,10),
                  title='Fraction of Genome Altered - VAF levels - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_hotspot,
               group='vaf_group',
               metrics='frac_genome_altered',
               group_list=['low', 'medium'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-76-db6ca157e882> in <module>
      6                   figsize=(3,10),
      7                   title='Fraction of Genome Altered - VAF levels - {}'.format(cancer),
----> 8                   xlim=[0,1])
      9 plt.show()
     10 

<ipython-input-3-27302fc2b541> in boxplot_sampletype(df, group, palette, order, metrics, figsize, title, title_font, xlim)
     35     ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
     36 
---> 37     sns.boxplot(y=metrics, x=group,data=df,ax=ax, dodge=False,order=order, palette=palette).set_title(title, weight='bold', fontsize=title_font)
     38 
     39     groupby_ = get_groupby(df,group, 'count')

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/seaborn/categorical.py in boxplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth, whis, ax, **kwargs)
   2239     plotter = _BoxPlotter(x, y, hue, data, order, hue_order,
   2240                           orient, color, palette, saturation,
-> 2241                           width, dodge, fliersize, linewidth)
   2242 
   2243     if ax is None:

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/seaborn/categorical.py in __init__(self, x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth)
    441                  width, dodge, fliersize, linewidth):
    442 
--> 443         self.establish_variables(x, y, hue, data, orient, order, hue_order)
    444         self.establish_colors(color, palette, saturation)
    445 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/seaborn/categorical.py in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    150                 if isinstance(var, str):
    151                     err = "Could not interpret input '{}'".format(var)
--> 152                     raise ValueError(err)
    153 
    154             # Figure out the plotting orientation

ValueError: Could not interpret input 'vaf_group'
In [77]:
fig, ax = boxplot_sampletype(df=master_hotspot,
                  group='ccf_group',
                  palette={'low': tab10[0] , 'medium': tab10[1], 'high':tab10[2]},
                  order=['low', 'medium', 'high'],
                  metrics='frac_genome_altered',
                  figsize=(3,10),
                  title='Fraction of Genome Altered - CCF levels - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_hotspot,
               group='ccf_group',
               metrics='frac_genome_altered',
               group_list=['low', 'medium'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-0c5492f2e387> in <module>
      6                   figsize=(3,10),
      7                   title='Fraction of Genome Altered - CCF levels - {}'.format(cancer),
----> 8                   xlim=[0,1])
      9 plt.show()
     10 

<ipython-input-3-27302fc2b541> in boxplot_sampletype(df, group, palette, order, metrics, figsize, title, title_font, xlim)
     35     ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
     36 
---> 37     sns.boxplot(y=metrics, x=group,data=df,ax=ax, dodge=False,order=order, palette=palette).set_title(title, weight='bold', fontsize=title_font)
     38 
     39     groupby_ = get_groupby(df,group, 'count')

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/seaborn/categorical.py in boxplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth, whis, ax, **kwargs)
   2239     plotter = _BoxPlotter(x, y, hue, data, order, hue_order,
   2240                           orient, color, palette, saturation,
-> 2241                           width, dodge, fliersize, linewidth)
   2242 
   2243     if ax is None:

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/seaborn/categorical.py in __init__(self, x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth)
    441                  width, dodge, fliersize, linewidth):
    442 
--> 443         self.establish_variables(x, y, hue, data, orient, order, hue_order)
    444         self.establish_colors(color, palette, saturation)
    445 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/seaborn/categorical.py in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    150                 if isinstance(var, str):
    151                     err = "Could not interpret input '{}'".format(var)
--> 152                     raise ValueError(err)
    153 
    154             # Figure out the plotting orientation

ValueError: Could not interpret input 'ccf_group'

Clinical Correlates

Age

In [78]:
#fig=plt.figure(figsize=(10,3))
ax = plt.subplot2grid(shape=(4,1), loc=(0,0), colspan=1)

sns.boxplot(x='Patient_Current_Age',data=master_no_wgd_cancer, ax=ax).set_title('Patient Age - {}'.format(cancer), weight='bold', fontsize=14)


style(ax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

TP53 Residual Groups

In [79]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_res_group',
                  palette=palette_res,
                  order=res_group_list,
                  metrics='Patient_Current_Age',
                  figsize=(3,10),
                  title='Patient Current Age - {}'.format(cancer),
                  xlim=[20,100])
plt.show()

get_statistics(df=master_no_wgd_cancer,
               group='tp53_res_group',
               metrics='Patient_Current_Age',
               group_list=['tp53_res', 'no_tp53_res'])
Out[79]:
0 1 2
0 size Patient_Current_Age
1 tp53_res 808 58
2 no_tp53_res 561 54.5
3 Statistics p-value
4 4.33976 1.42636e-05

TP53 Subgroups

In [80]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='Patient_Current_Age',
                  figsize=(7,10),
                  title='Patient Current Age - {}'.format(cancer),
                  xlim=[20,100])
plt.show()

get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='Patient_Current_Age',
               group_list=['1_WILD_TYPE', '>=1_cnLOH'])
Out[80]:
0 1 2
0 size Patient_Current_Age
1 1_WILD_TYPE 23 55
2 >=1_cnLOH 64 55.5
3 Statistics p-value
4 0.341683 0.732589

Sex

In [81]:
h = get_groupby(master_no_wgd_cancer,'Sex', 'count').sort_values(by='count', ascending=False)
display(h)

h = h.T
h = h[['Male', 'Female']]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax)
ax.legend(['Male', 'Female'],loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=11)
ax.set_title('Sex Distribution - {} - No WGD'.format(cancer), weight='bold', fontsize=18)

plt.show()
count
Sex
Female 2307
Male 10

Survival Analysis

In [82]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
In [83]:
data = master_no_wgd_cancer.dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)

data = data[['tp53_group', 'tp53_res_group', 'Overall Survival Status 0/1', 'Overall_Survival_Months']]

ix1 = data['tp53_res_group'] == 'tp53_res'
ix2 = data['tp53_res_group'] == 'no_tp53_res'

T_exp, E_exp = data.loc[ix1, 'Overall_Survival_Months'], data.loc[ix1, 'Overall Survival Status 0/1']
T_con, E_con = data.loc[ix2, 'Overall_Survival_Months'], data.loc[ix2, 'Overall Survival Status 0/1']

results = logrank_test(T_exp, T_con, event_observed_A=E_exp, event_observed_B=E_con)
results.print_summary()
t_0 -1
null_distribution chi squared
degrees_of_freedom 1
test_name logrank_test
test_statistic p
0 44.02 <0.005
In [91]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - Non-WGD Cohort - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list[:2], range(len(res_group_list[:2]))):
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
    kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
plt.show()
In [89]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - WGD Cohort - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()


for group,i in zip(['bi', 'tp53_res'], range(2)):
    data = master_wgd_cancer[master_wgd_cancer['prewgd_tp53_group_1'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
    kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
plt.show()
In [90]:
from lifelines import KaplanMeierFitter

fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - Non-WGD Cohort - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
    kmf.plot_survival_function(color = palette_list[i], ax=ax)
plt.show()

High CCF

In [93]:
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]
In [94]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_high[master_high['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
    except: pass
plt.show()


fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_high[master_high['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = palette_list[i], ax=ax)
    except: pass
plt.show()
In [ ]: